import os
import re
import sys
import time
import jieba
import random
import json
import platform
import jieba.posseg as pseg

from pymongo import MongoClient
from selenium import webdriver
from bs4 import BeautifulSoup

# Raise the interpreter's recursion limit to one million frames.
# NOTE(review): a limit this high can let runaway recursion exhaust the C
# stack and crash the interpreter instead of raising RecursionError —
# confirm it is still required.
sys.setrecursionlimit(1000000)  # for example, set to one million here

# MongoDB client for the server on localhost:27017.
# NOTE(review): connect = False presumably defers opening the connection
# until the first operation — confirm against the installed pymongo version.
client = MongoClient('127.0.0.1', 27017, connect = False)

# Database and collection in which the crawled company ids are stored.
itjuzi_db = client['itjuzi_db']
itjuzi_coll = itjuzi_db['itjuzi_coll']


def new_driver(headless=False,
               chrome_driver_path='/Users/xuchaosheng/Workspace/cv_scrapy/libs/chromedriver'):
    """Create and return a Chrome WebDriver.

    Args:
        headless: When true, Chrome is started with the ``--headless``,
            ``--disable-gpu`` and ``--disable-images`` arguments.
        chrome_driver_path: Filesystem path of the chromedriver executable.
            Defaults to the location this script was originally written
            for; pass a different path to run on another machine.

    Returns:
        The ``webdriver.Chrome`` instance that was started.
    """
    options = webdriver.ChromeOptions()

    if headless:
        # NOTE(review): '--disable-images' does not look like a documented
        # Chrome switch — confirm it actually suppresses image loading.
        for flag in ('--headless', '--disable-gpu', '--disable-images'):
            options.add_argument(flag)

    return webdriver.Chrome(
        chrome_driver_path,
        desired_capabilities=options.to_capabilities(),
    )


# Start a Chrome session at import time (new_driver() defaults to a visible,
# non-headless window). company_list() drives this module-level `driver`.
driver = new_driver()
driver.get('https://www.itjuzi.com/')
# Stay on the home page for 20 seconds before navigating on.
# NOTE(review): presumably leaves time for a manual login or for the site to
# finish loading — confirm the intent.
time.sleep(20)
# Open the company listing page that company_list() scrapes.
driver.get('https://www.itjuzi.com/company')


def _store_company_ids(soup):
    """Insert every not-yet-stored company id found in the listing table."""
    # The first row of the table is skipped (treated as the header row).
    for row in soup.select('.juzi-table tr')[1:]:
        link = row.find('a')
        href = link.get('href') if link is not None else None
        if not href:
            # Row without a company link: nothing to record.
            continue

        # The company id is the last path segment of the link target.
        comp_id = href.split('/')[-1]

        if not itjuzi_coll.find_one({'comp_id': comp_id}):
            itjuzi_coll.insert_one({
                'comp_id': comp_id
            })
            print(comp_id)


def _click_next_page(max_attempts=100):
    """Click the pagination "next" button.

    Returns True once the click has been issued, and False when the button
    is disabled or could not be clicked within ``max_attempts`` tries.
    """
    # Imported locally so this block does not depend on a file-level import.
    from selenium.common.exceptions import WebDriverException

    for _ in range(max_attempts):
        try:
            button = driver.find_element_by_css_selector('.btn-next')
            if not button.is_enabled():
                # A disabled "next" button is taken to mean the last page.
                return False
            button.click()
            return True
        except WebDriverException:
            # Button missing or not clickable yet: scroll down 200px and retry.
            driver.execute_script("window.scrollTo(0,window.scrollY + 200)")

    return False


def company_list():
    """Walk the company listing page by page and store each company id.

    For every page, the rendered table is parsed and each company id that is
    not already in ``itjuzi_coll`` is inserted and printed. The crawl then
    clicks the "next" button, waits 3 seconds and continues with the new
    page. It stops when the "next" button is disabled or cannot be clicked.

    Pages are processed in a loop rather than by recursion, so the call
    stack does not grow with the number of pages.
    """
    while True:
        page_source = driver.execute_script("return document.body.innerHTML;")
        _store_company_ids(BeautifulSoup(page_source, 'lxml'))

        if not _click_next_page():
            break

        # Give the next page time to render before reading it.
        time.sleep(3)


# Run the crawl only when this file is executed as a script.
# NOTE: the module-level browser start-up earlier in this file runs at import
# time regardless of this guard.
if __name__ == '__main__':
    company_list()
